0.1 Optimal number of cluster

To find the optimal number of clusters, we will use the silhouette method and compute the average silhouette score as a function of the number of clusters; we will also explore the impact of varying the ratio between the symptoms and the tracking distances (par$r).

load(paste0(IO$tmp_data,"d_wide_imputed.Rdata"), verbose = TRUE)
## Loading objects:
##   d_wide
load(paste0(IO$tmp_data,"d_imputed.Rdata"), verbose = TRUE)
## Loading objects:
##   d
#cycle_ids = unique(d$cycle_id_m)

#sample_size= 100000
#if(nrow(d_wide)>sample_size){d_wide_sample = d_wide[sample(1:nrow(d_wide),sample_size),]}else{d_wide_sample = d_wide}


n_centers = 2:7
df = data.frame()
tic()
    for(r in c(0,1/2,2/3,0.75,5/6,1)){
      cat(r,"\n")
      for(n_center in n_centers){
        cat("\t",n_center,"\n")
        
        clara_n = clara(d_wide[,-1], k = n_center, 
                        metric = "custom_jaccard", 
                        w = par$w, cjratio = r, 
                        samples = par$clara$samples, sampsize = par$clara$sampsize)
        
        cat("\t\t",clara_n$silinfo$avg.width,"\n")
        df = rbind(df,data.frame(n_clust = n_center, r = r, avg_silhouette_score = clara_n$silinfo$avg.width))
        
        silh = as.data.frame(clara_n$silinfo$widths)
        silh$x = 1:nrow(silh)
        silh$rows = as.numeric(rownames(silh))
        
        d$cluster_num = clara_n$clustering[match(d$cycle_id_m, d_wide$cycle_id_m)]
        
        # show the samples chosen by clara FOR SILHOUETTE
        m = match(silh$rows, rownames(d_wide))
        cycle_ids_silh = as.character(d_wide$cycle_id_m[m])
        m = which(!is.na(match( d$cycle_id_m, cycle_ids_silh)))
        d_subset = d[m,]
        d_subset = d_subset[order(d_subset$cluster_num),]
        d_subset$cycle_id_m = factor(d_subset$cycle_id_m, levels = rev(cycle_ids_silh))
        g_samples = ggplot_imputed_TB(d_subset, facet_grid = NULL)+scale_size(range = c(2,2)) + ggtitle(paste0("r = ",r))
        
        g_silh = ggplot(silh, aes(fill = factor(cluster),x = factor(x, levels =rev(x)), y = sil_width)) + coord_flip()+
          geom_bar(stat = "identity")+
          geom_point(aes( col = factor(neighbor),y = pmax(0,sil_width) + 0.025), size = 3)+
          #ylim(c(-1,1))+
          geom_hline(yintercept = clara_n$silinfo$avg.width, linetype = 2)+ 
          ggtitle(paste0("avg. silh score = ",round(clara_n$silinfo$avg.width,digits = 3)))+
          guides(fill = FALSE, color = FALSE)+
          xlab("")+ylab("silh width")+ theme(axis.text.y = element_blank())
        
        
        mds = data.frame(cmdscale(d = clara_n$diss, k = 2))
        colnames(mds) = c("x","y")
        mds$cluster_num = as.factor(clara_n$clustering[match(rownames(mds), names(clara_n$clustering))])
        m = match( silh$rows, rownames(mds))
        mds = mds[m,]
        mds$num = 1:nrow(mds)
        g_mds = ggplot(mds, aes(x = x, y = y, col = cluster_num)) + coord_fixed()+
          geom_point(size = 4, alpha = 0.5) + 
          geom_text(label = 1:nrow(mds))+
          guides(color = FALSE)+ggtitle("MDS")+xlab("")+ylab("")
        
        grid.arrange(g_samples, g_silh, g_mds, nrow = 1, widths = c(2,1,2.2))
      }
    }
## 0 
##   2 
##       0.4728137

##   3 
##       0.4357696

##   4 
##       0.4029847

##   5 
##       0.4247842

##   6 
##       0.3458469

##   7 
##       0.2600311

## 0.5 
##   2 
##       0.2143314

##   3 
##       0.05018665

##   4 
##       0.07942411

##   5 
##       0.0007393174

##   6 
##       0.01058468

##   7 
##       0.0153732

## 0.6666667 
##   2 
##       0.1417689

##   3 
##       0.06625149

##   4 
##       0.05116242

##   5 
##       -0.009741934

##   6 
##       0.003536232

##   7 
##       0.007290257

## 0.75 
##   2 
##       0.06334959

##   3 
##       0.0448804

##   4 
##       0.02009475

##   5 
##       0.003845095

##   6 
##       0.02612914

##   7 
##       0.0006769837

## 0.8333333 
##   2 
##       0.06996795

##   3 
##       0.02783658

##   4 
##       0.004688544

##   5 
##       0.003934847

##   6 
##       0.0002179191

##   7 
##       0.004063646

## 1 
##   2 
##       0.06335609

##   3 
##       0.01308502

##   4 
##       -0.01997871

##   5 
##       -0.02490028

##   6 
##       -0.0490737

##   7 
##       -0.01502688

toc()
## 60.306 sec elapsed
save(df, file = paste0(IO$tmp_Rdata,"silhouette_score_summary.Rdata"))

rm(d_subset, n_center, n_centers)

0.1.1 Average silhouette score per number of clusters

df$n_clusters = factor(df$n_clust)
ggplot(df, aes(x = n_clusters, y = avg_silhouette_score, fill = avg_silhouette_score)) + geom_bar(stat = "identity")+ facet_grid(r ~ .)